In this part of the analysis, we want to know how STEM policy influences graduates. We first separate our data into two categories: citizens and non-citizens. Then we further separate the data into STEM majors and non-STEM majors. With these separations, we can easily compare the influence of STEM policy on citizens with its influence on non-citizens, and the employment status of STEM majors with that of non-STEM majors.
Is it true that graduates with STEM degrees can easily find jobs?
Does the level of degree influence the employment rate of STEM graduates? Does it influence the employment rate of non-STEM graduates?
Which fields do STEM graduates enter, and which fields do non-STEM graduates enter?
Before we start analyzing the data, we need to clean and preprocess it. The following R code contains the specific cleaning and preprocessing steps. (Note: This part was done by Yueqi Zhang.)
library(data.table)
library(dplyr)
# Columns to keep when reading the raw files.
cols <- c(
  "SERIALNO", "SPORDER", "ST", "AGEP", "CIT", "NWLK",
  "SCH", "SCHL", "WKL", "WRK", "FOD1P", "OCCP"
)
# Load only those columns from the first raw file (ss14pusa.csv).
data14a <- fread(
  "/Users/YaqingXie/Desktop/Applied Data Science/Week1/csv_pus/ss14pusa.csv",
  select = cols
)
Read 0.0% of 1611956 rows
Read 19.2% of 1611956 rows
Read 37.8% of 1611956 rows
Read 56.5% of 1611956 rows
Read 75.7% of 1611956 rows
Read 94.9% of 1611956 rows
Read 1611956 rows and 12 (of 284) columns from 1.427 GB file in 00:00:09
# Load the same columns from the second raw file (ss14pusb.csv).
data14b <- fread(
  "/Users/YaqingXie/Desktop/Applied Data Science/Week1/csv_pus/ss14pusb.csv",
  select = cols
)
Read 0.0% of 1520654 rows
Read 19.7% of 1520654 rows
Read 39.5% of 1520654 rows
Read 58.5% of 1520654 rows
Read 78.9% of 1520654 rows
Read 98.6% of 1520654 rows
Read 1520654 rows and 12 (of 284) columns from 1.346 GB file in 00:00:09
# Stack the two input tables and keep only rows with a recorded field of
# degree (FOD1P).  is.na() is used instead of comparing with the string "NA",
# which only worked because filter() drops rows whose comparison is NA.
data14 <- rbind(data14a, data14b)
data14 <- filter(data14, !is.na(FOD1P))

# FOD1P codes that this analysis treats as STEM majors.
# NOTE(review): 2405 and 2406 are absent although 2400-2404 and 2407-2419 are
# listed -- confirm whether that omission is intentional.
stem.field <- c(
  1103, 1104, 1105, 1106, 1301, 1401, 2100, 2101, 2102, 2105, 2106, 2107,
  2400, 2401, 2402, 2403, 2404, 2407, 2408, 2409, 2410, 2411, 2412, 2413,
  2414, 2415, 2416, 2417, 2418, 2419, 2499, 2500, 2501, 2502, 2503, 2504,
  2599, 3600, 3601, 3602, 3603, 3604, 3605, 3606, 3607, 3608, 3609, 3611,
  3699, 3700, 3701, 3702, 3801, 4002, 4005, 4006, 5000, 5001, 5002, 5003,
  5004, 5005, 5006, 5007, 5008, 5102, 5206, 5901, 6105, 6202, 6212
)

# One-column 0/1 matrix named STEM: 1 when the row's FOD1P is in stem.field.
# A single %in% test replaces the loop that grew an index vector code by code.
stem <- matrix(
  as.numeric(data14$FOD1P %in% stem.field),
  ncol = 1,
  dimnames = list(NULL, "STEM")
)
data14 <- cbind(data14, stem)
# Collapse CIT into a binary flag: code 5 becomes 0 and codes 2-4 become 1
# (code 1 is left as it is).
data14$CIT[data14$CIT %in% 5] <- 0
data14$CIT[data14$CIT %in% c(2, 3, 4)] <- 1
# Keep rows with SCH == 1 and SCHL above 20.
data14 <- filter(data14, SCH == 1, SCHL > 20)

# Blank out the occupation code of every row flagged by NWLK == 1, WKL == 2,
# WRK == 2 or OCCP == 9920.  A real NA is stored instead of the string "NA",
# so OCCP keeps its numeric type and is.na() works on it directly.
# %in% never yields NA, so rows with missing flags are simply left untouched.
no.job <- data14$NWLK %in% 1 |
  data14$WKL %in% 2 |
  data14$WRK %in% 2 |
  data14$OCCP %in% 9920
data14$OCCP[no.job] <- NA
# Occupation codes (OCCP) grouped under a three-letter label.  Codes are
# written without leading zeros: R parses 0010 as the number 10 anyway, so the
# zero-padded spelling only suggested a distinction that never existed.
occp.groups <- list(
  MGR = c(
    10, 20, 40, 50, 60, 100, 120, 135, 136, 137, 140, 150, 160, 205, 220,
    230, 300, 310, 330, 340, 350, 360, 410, 420, 425, 430
  ),
  BUS = c(
    500, 510, 520, 530, 540, 565, 600, 630, 640, 650, 700, 710, 725, 726,
    735, 740
  ),
  FIN = c(800, 810, 820, 830, 840, 850, 860, 900, 910, 930, 940, 950),
  CMM = c(
    1005, 1006, 1007, 1010, 1020, 1030, 1050, 1060, 1105, 1106, 1107, 1200,
    1220, 1240
  ),
  ENG = c(
    1300, 1310, 1320, 1340, 1350, 1360, 1400, 1410, 1420, 1430, 1440, 1450,
    1460, 1520, 1530, 1540, 1550, 1560
  ),
  SCI = c(
    1600, 1610, 1640, 1650, 1700, 1710, 1720, 1740, 1760, 1800, 1820, 1840,
    1860, 1900, 1910, 1920, 1930, 1965
  ),
  CMS = c(2000, 2010, 2015, 2016, 2025, 2040, 2050, 2060),
  LGL = c(2100, 2105, 2145, 2160),
  EDU = c(2200, 2300, 2310, 2320, 2330, 2340, 2440, 2540, 2550),
  ENT = c(
    2600, 2630, 2700, 2710, 2720, 2740, 2750, 2760, 2800, 2810, 2825, 2830,
    2840, 2850, 2860, 2900, 2910, 2920
  ),
  MED = c(
    3000, 3010, 3030, 3040, 3050, 3060, 3110, 3120, 3140, 3150, 3160, 3200,
    3210, 3220, 3230, 3245, 3250, 3255, 3256, 3258, 3260, 3300, 3310, 3320,
    3400, 3420, 3500, 3510, 3520, 3535, 3540
  ),
  HLS = c(3600, 3610, 3620, 3630, 3640, 3645, 3646, 3647, 3648, 3649, 3655),
  PRT = c(
    3700, 3710, 3720, 3730, 3740, 3750, 3800, 3820, 3840, 3850, 3900, 3910,
    3930, 3940, 3945, 3955
  ),
  EAT = c(
    4000, 4010, 4020, 4030, 4040, 4050, 4060, 4110, 4120, 4130, 4140, 4150
  ),
  CLN = c(4200, 4210, 4220, 4230, 4240, 4250),
  PRS = c(
    4300, 4320, 4340, 4350, 4400, 4410, 4420, 4430, 4460, 4465, 4500, 4510,
    4520, 4530, 4540, 4600, 4610, 4620, 4640, 4650
  ),
  SAL = c(
    4700, 4710, 4720, 4740, 4750, 4760, 4800, 4810, 4820, 4830, 4840, 4850,
    4900, 4920, 4930, 4940, 4950, 4965
  ),
  OFF = c(
    5000, 5010, 5020, 5030, 5100, 5110, 5120, 5130, 5140, 5150, 5160, 5165,
    5200, 5220, 5230, 5240, 5250, 5260, 5300, 5310, 5320, 5330, 5340, 5350,
    5360, 5400, 5410, 5420, 5500, 5510, 5520, 5530, 5540, 5550, 5560, 5600,
    5610, 5620, 5630, 5700, 5800, 5810, 5820, 5840, 5850, 5860, 5900, 5910,
    5920, 5940
  ),
  FFF = c(6005, 6010, 6040, 6050, 6100, 6120, 6130),
  CON = c(
    6200, 6210, 6220, 6230, 6240, 6250, 6260, 6300, 6320, 6330, 6355, 6360,
    6400, 6420, 6440, 6460, 6515, 6520, 6530, 6600, 6660, 6700, 6710, 6720,
    6730, 6740, 6765
  ),
  EXT = c(6800, 6820, 6830, 6840, 6940),
  RPR = c(
    7000, 7010, 7020, 7030, 7040, 7100, 7110, 7120, 7130, 7140, 7150, 7160,
    7200, 7210, 7220, 7240, 7260, 7300, 7315, 7320, 7330, 7340, 7350, 7360,
    7410, 7420, 7430, 7510, 7540, 7560, 7610, 7630
  ),
  PRD = c(
    7700, 7710, 7720, 7730, 7740, 7750, 7800, 7810, 7830, 7840, 7850, 7855,
    7900, 7920, 7930, 7940, 7950, 8030, 8040, 8100, 8130, 8140, 8220, 8250,
    8255, 8256, 8300, 8310, 8320, 8330, 8350, 8400, 8410, 8420, 8450, 8460,
    8500, 8510, 8530, 8540, 8550, 8600, 8610, 8620, 8630, 8640, 8650, 8710,
    8720, 8730, 8740, 8750, 8760, 8800, 8810, 8830, 8850, 8910, 8920, 8930,
    8940, 8950, 8965
  ),
  TRN = c(
    9000, 9030, 9040, 9050, 9110, 9120, 9130, 9140, 9150, 9200, 9240, 9260,
    9300, 9310, 9350, 9360, 9410, 9415, 9420, 9510, 9520, 9560, 9600, 9610,
    9620, 9630, 9640, 9650, 9720, 9750
  ),
  # NOTE(review): rows with OCCP 9920 have their occupation blanked before
  # this step, so the 9920 entry below can never match.
  MIL = c(9800, 9810, 9820, 9830, 9920)
)

# One-column matrix named OCCP.TYPE.  Rows whose OCCP is missing or not listed
# above keep the placeholder "0".
occp.type <- matrix(
  "0",
  nrow = nrow(data14),
  ncol = 1,
  dimnames = list(NULL, "OCCP.TYPE")
)
# %in% labels a whole group at once and ignores missing OCCP values; the
# groups share no codes, so the order of assignment does not matter.
for (label in names(occp.groups)) {
  occp.type[data14$OCCP %in% occp.groups[[label]], 1] <- label
}
data14 <- cbind(data14, occp.type)
# Save the full cleaned table, then reload only the analysis columns from it
# and save that narrower table as well.
write.csv(data14, file = "data2014.csv", row.names = FALSE)
data.14 <- fread(
  "data2014.csv",
  select = c("ST", "AGEP", "CIT", "SCHL", "FOD1P", "OCCP", "STEM", "OCCP.TYPE")
)
write.csv(data.14, file = "fndata2014.csv", row.names = FALSE)

# Split on the recoded CIT flag (0 goes to the non-citizen table, everything
# else to the citizen table) and save each part.
data.14.noncit <- subset(data.14, data.14$CIT == 0)
data.14.cit <- subset(data.14, data.14$CIT != 0)
write.csv(data.14.cit, file = "data2014_cit.csv", row.names = FALSE)
write.csv(data.14.noncit, file = "data2014_noncit.csv", row.names = FALSE)
After cleaning the data, we can begin the analysis. First, I set the working directory and load the data sets that I need into the R workspace. Then I split each data set into two parts according to citizenship status.
# NOTE(review): hard-coded, machine-specific directory; the relative CSV names
# read after this call are resolved against it.
setwd("/Users/YaqingXie/Desktop/Applied Data Science/Fall2016-proj1-grp15/data")
The working directory was changed to /Users/YaqingXie/Desktop/Applied Data Science/Fall2016-proj1-grp15/data inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the the working directory for notebook chunks.
# For each year, load the cleaned table and split it on the CIT flag:
# rows with CIT == 1 go to the *cit table, rows with CIT == 0 to *ncit.
pus11 <- read.csv("fndata2011.csv")
pus11cit <- pus11[pus11$CIT == 1, ]
pus11ncit <- pus11[pus11$CIT == 0, ]

pus12 <- read.csv("fndata2012.csv")
pus12cit <- pus12[pus12$CIT == 1, ]
pus12ncit <- pus12[pus12$CIT == 0, ]

pus13 <- read.csv("fndata2013.csv")
pus13cit <- pus13[pus13$CIT == 1, ]
pus13ncit <- pus13[pus13$CIT == 0, ]

pus14 <- read.csv("fndata2014.csv")
pus14cit <- pus14[pus14$CIT == 1, ]
pus14ncit <- pus14[pus14$CIT == 0, ]
Now, for the convenience of further study, we can look at a small part of our data in a table.
library(DT)
package ‘DT’ was built under R version 3.2.5
# Show the first 50 citizen rows of 2011 as an interactive table, 10 per page.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
datatable(
  head(pus11cit, 50),
  options = list(scrollX = TRUE, pageLength = 10)
)
Note: the specification for S3 class “AsIs” in package ‘jsonlite’ seems equivalent to one from package ‘DBI’: not turning on duplicate class definitions for this class.
The following R code contains the analysis for our 1st concern.
# Part 1: employment rate per year (2011-2014) in each of the four groups
# (citizen / non-citizen crossed with STEM / non-STEM).

# Step 1: within every group keep only rows with AGEP below 66.
under66 <- function(pus, stem.flag) {
  pus[pus$STEM == stem.flag & pus$AGEP < 66, ]
}
pus11citstemy <- under66(pus11cit, 1)
pus11citnstemy <- under66(pus11cit, 0)
pus11ncitstemy <- under66(pus11ncit, 1)
pus11ncitnstemy <- under66(pus11ncit, 0)
pus12citstemy <- under66(pus12cit, 1)
pus12citnstemy <- under66(pus12cit, 0)
pus12ncitstemy <- under66(pus12ncit, 1)
pus12ncitnstemy <- under66(pus12ncit, 0)
pus13citstemy <- under66(pus13cit, 1)
pus13citnstemy <- under66(pus13cit, 0)
pus13ncitstemy <- under66(pus13ncit, 1)
pus13ncitnstemy <- under66(pus13ncit, 0)
pus14citstemy <- under66(pus14cit, 1)
pus14citnstemy <- under66(pus14cit, 0)
pus14ncitstemy <- under66(pus14ncit, 1)
pus14ncitnstemy <- under66(pus14ncit, 0)

# Step 2: the rate is the share of rows whose OCCP is not missing.
emp.rate <- function(pus) {
  sum(!is.na(pus$OCCP)) / nrow(pus)
}
citstememp <- c(
  emp.rate(pus11citstemy), emp.rate(pus12citstemy),
  emp.rate(pus13citstemy), emp.rate(pus14citstemy)
)
citnstememp <- c(
  emp.rate(pus11citnstemy), emp.rate(pus12citnstemy),
  emp.rate(pus13citnstemy), emp.rate(pus14citnstemy)
)
ncitstememp <- c(
  emp.rate(pus11ncitstemy), emp.rate(pus12ncitstemy),
  emp.rate(pus13ncitstemy), emp.rate(pus14ncitstemy)
)
ncitnstememp <- c(
  emp.rate(pus11ncitnstemy), emp.rate(pus12ncitnstemy),
  emp.rate(pus13ncitnstemy), emp.rate(pus14ncitnstemy)
)

# Step 3: arrange the rates as a group-by-year matrix, reshape to long form
# (one row per year and group) and draw a dodged bar chart with value labels.
emp.years <- c("2011", "2012", "2013", "2014")
emp.groups <- c(
  "citizen STEM", "citizen non-STEM",
  "non-citizen STEM", "non-citizen non-STEM"
)
empmat <- rbind(citstememp, citnstememp, ncitstememp, ncitnstememp)
colnames(empmat) <- emp.years
rownames(empmat) <- emp.groups
empdf <- data.frame(
  year = factor(rep(emp.years, each = 4), levels = emp.years),
  status = factor(rep(emp.groups, 4)),
  rate = c(empmat[, 1], empmat[, 2], empmat[, 3], empmat[, 4])
)
library(ggplot2)
ggplot(data = empdf, aes(x = year, y = rate, fill = status)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  geom_text(
    aes(label = round(rate, 2)),
    position = position_dodge(width = 0.9)
  )
With this visualization, we have finished the analysis of our 1st concern.
The following R code contains the analysis for our 2nd concern.
# Part 2: employment rate by degree level in each group, using the 2014 data.

# Step 1: split every 2014 group on SCHL; the suffixes b, m, p and d hold
# SCHL 21, 22, 23 and 24, plotted below as bachelor, master, professional
# and doctorate.
with.degree <- function(pus, level) {
  pus[pus$SCHL == level, ]
}
pus14citstemyb <- with.degree(pus14citstemy, 21)
pus14citstemym <- with.degree(pus14citstemy, 22)
pus14citstemyp <- with.degree(pus14citstemy, 23)
pus14citstemyd <- with.degree(pus14citstemy, 24)
pus14citnstemyb <- with.degree(pus14citnstemy, 21)
pus14citnstemym <- with.degree(pus14citnstemy, 22)
pus14citnstemyp <- with.degree(pus14citnstemy, 23)
pus14citnstemyd <- with.degree(pus14citnstemy, 24)
pus14ncitstemyb <- with.degree(pus14ncitstemy, 21)
pus14ncitstemym <- with.degree(pus14ncitstemy, 22)
pus14ncitstemyp <- with.degree(pus14ncitstemy, 23)
pus14ncitstemyd <- with.degree(pus14ncitstemy, 24)
pus14ncitnstemyb <- with.degree(pus14ncitnstemy, 21)
pus14ncitnstemym <- with.degree(pus14ncitnstemy, 22)
pus14ncitnstemyp <- with.degree(pus14ncitnstemy, 23)
pus14ncitnstemyd <- with.degree(pus14ncitnstemy, 24)

# Step 2: share of rows with a non-missing OCCP, per degree level.
share.employed <- function(pus) {
  sum(!is.na(pus$OCCP)) / nrow(pus)
}
citstemdeg <- vapply(
  list(pus14citstemyb, pus14citstemym, pus14citstemyp, pus14citstemyd),
  share.employed,
  numeric(1)
)
citnstemdeg <- vapply(
  list(pus14citnstemyb, pus14citnstemym, pus14citnstemyp, pus14citnstemyd),
  share.employed,
  numeric(1)
)
ncitstemdeg <- vapply(
  list(pus14ncitstemyb, pus14ncitstemym, pus14ncitstemyp, pus14ncitstemyd),
  share.employed,
  numeric(1)
)
ncitnstemdeg <- vapply(
  list(pus14ncitnstemyb, pus14ncitnstemym, pus14ncitnstemyp, pus14ncitnstemyd),
  share.employed,
  numeric(1)
)

# Step 3: group-by-degree matrix, long-form data frame, dodged bar chart.
deg.levels <- c("bachelor", "master", "professional", "doctorate")
deg.groups <- c(
  "citizen STEM", "citizen non-STEM",
  "non-citizen STEM", "non-citizen non-STEM"
)
degmat <- rbind(citstemdeg, citnstemdeg, ncitstemdeg, ncitnstemdeg)
colnames(degmat) <- deg.levels
rownames(degmat) <- deg.groups
degdf <- data.frame(
  degree = factor(rep(deg.levels, each = 4), levels = deg.levels),
  status = factor(rep(deg.groups, 4)),
  rate = c(degmat[, 1], degmat[, 2], degmat[, 3], degmat[, 4])
)
ggplot(data = degdf, aes(x = degree, y = rate, fill = status)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  geom_text(
    aes(label = round(rate, 2)),
    position = position_dodge(width = 0.9)
  )
With this visualization, we have finished the analysis of our 2nd concern.
The following R code contains the analysis for our 3rd concern.
First, I will draw a Sankey plot.
# Count how often each major (FOD1P) occurs in a population, most frequent
# first.  The result has one column, frequency, with the major codes as row
# names.  It is built explicitly because as.data.frame() on a sorted table
# returns Var1/Freq columns on newer R versions, which silently breaks the
# "rename the single column, read codes from the row names" approach.
count.majors <- function(pus) {
  freq <- sort(table(pus$FOD1P), decreasing = TRUE)
  data.frame(frequency = as.vector(freq), row.names = names(freq))
}
topmajors14cit <- count.majors(pus14cit)
topmajors14ncit <- count.majors(pus14ncit)

# STEM flag of each major: the STEM value of the first row carrying that code.
# match() does this in one pass instead of one data-frame scan per major.
stem.of.major <- function(pus, codes) {
  pus$STEM[match(as.numeric(codes), pus$FOD1P)]
}
majorstemv14cit <- stem.of.major(pus14cit, rownames(topmajors14cit))
majorstemv14ncit <- stem.of.major(pus14ncit, rownames(topmajors14ncit))

# Major-level table with columns frequency, major, stem, proportion (in that
# order; later steps address these columns by position).
library(dplyr)
summarise.majors <- function(counts, stem.flags) {
  data.frame(
    frequency = counts$frequency,
    major = rownames(counts),
    stem = stem.flags,
    proportion = counts$frequency / sum(counts$frequency),
    stringsAsFactors = FALSE
  )
}
ms14cit <- summarise.majors(topmajors14cit, majorstemv14cit)
ms14ncit <- summarise.majors(topmajors14ncit, majorstemv14ncit)

# Keep the n most frequent majors and fold the remainder into one "other" row.
# The extra row is a typed one-row data frame: binding a bare character vector
# turned every column into text.  head()/tail() also avoid the all-NA padding
# rows that [1:20, ] produces when fewer than n majors exist.
top.with.other <- function(ms, stem.flag, n = 20) {
  rest <- tail(ms, -n)
  other <- data.frame(
    frequency = sum(rest$frequency),
    major = "other",
    stem = stem.flag,
    proportion = sum(rest$proportion),
    stringsAsFactors = FALSE
  )
  out <- rbind(head(ms, n), other)
  rownames(out) <- NULL
  out
}

# Preprocess for the Sankey plot: top 20 STEM and top 20 non-STEM majors
# (plus "other") for citizens and for non-citizens.
ms14citallns <- ms14cit[ms14cit$stem %in% 0, ]
ms14citalls <- ms14cit[ms14cit$stem %in% 1, ]
ms14cittopns <- top.with.other(ms14citallns, 0)
ms14cittops <- top.with.other(ms14citalls, 1)
ms14cittop <- rbind(ms14cittops, ms14cittopns)

ms14ncitallns <- ms14ncit[ms14ncit$stem %in% 0, ]
ms14ncitalls <- ms14ncit[ms14ncit$stem %in% 1, ]
ms14ncittopns <- top.with.other(ms14ncitallns, 0)
ms14ncittops <- top.with.other(ms14ncitalls, 1)
ms14ncittop <- rbind(ms14ncittops, ms14ncittopns)
# Translate major codes into major names.
#
# code: one or more major codes, given as numbers or as strings
#       (e.g. 2102 or "2102").
# Returns a character vector of the same length as `code`.  Codes that are
# not in the table -- including NA and the literal "other" -- map to "other".
#
# A lookup table replaces the chain of one `if` per code, which made the
# function scalar-only and made it fail on a missing code.
codename <- function(code) {
  major.names <- c(
    "1100" = "general agriculture",
    "1101" = "agriculture production and management",
    "1102" = "agricultural economics",
    "1103" = "animal sciences",
    "1104" = "food science",
    "1105" = "plant science and agronomy",
    "1106" = "soil science",
    "1199" = "miscellaneous agriculture",
    "1301" = "environmental science",
    "1302" = "forestry",
    "1303" = "natural resources management",
    "1401" = "architecture",
    "1501" = "area ethnic and civilization",
    "1901" = "communications",
    "1902" = "journalism",
    "1903" = "mass media",
    "1904" = "advertising and public relations",
    "2001" = "communication technologies",
    "2100" = "computer and information systems",
    "2101" = "computer programming and data processing",
    "2102" = "computer science",
    "2105" = "information sciences",
    "2106" = "computer administration management and security",
    "2107" = "computer networking and telecommunications",
    "2201" = "cosmetology services and culinary arts",
    "2300" = "general education",
    "2301" = "educational administration and supervision",
    "2303" = "school student counseling",
    "2304" = "elementary education",
    "2305" = "mathematics teacher education",
    "2306" = "physical and health education teaching",
    "2307" = "early childhood education",
    "2308" = "science and computer teacher education",
    "2309" = "secondary teacher education",
    "2310" = "special needs education",
    "2311" = "social science or history teacher education",
    "2312" = "teacher education: multiple levels",
    "2313" = "language and drama education",
    "2314" = "art and music education",
    "2399" = "miscellaneous education",
    "2400" = "general engineering",
    "2401" = "aerospace engineering",
    "2402" = "biological engineering",
    "2403" = "architectural engineering",
    "2404" = "biomedical engineering",
    "2405" = "chemical engineering",
    "2406" = "civil engineering",
    "2407" = "computer engineering",
    "2408" = "electrical engineering",
    "2409" = "engineering mechanics physics and science",
    "2410" = "environmental engineering",
    "2411" = "geological and geophysical engineering",
    "2412" = "industrial and manufacturing engineering",
    "2413" = "materials engineering and materials science",
    "2414" = "mechanical engineering",
    "2415" = "metallurgical engineering",
    "2416" = "mining and mineral engineering",
    "2417" = "naval architecture and marine engineering",
    "2418" = "nuclear engineering",
    "2419" = "petroleum engineering",
    "2499" = "miscellaneous engineering",
    "2500" = "engineering technologies",
    "2501" = "engineering and industrial management",
    "2502" = "electrical engineering technology",
    "2503" = "industrial production technologies",
    "2504" = "mechanical engineering related technologies",
    "2599" = "miscellaneous engineering technologies",
    "2601" = "linguistics and comparative language and literature",
    "2602" = "french german latin and other common foreign language studies",
    "2603" = "other foreign languages",
    "2901" = "family and consumer sciences",
    "3201" = "court reporting",
    "3202" = "pre-law and legal studies",
    "3301" = "english language and literature",
    "3302" = "composition and rhetoric",
    "3401" = "liberal arts",
    "3402" = "humanities",
    "3501" = "library science",
    "3600" = "biology",
    "3601" = "biochemical sciences",
    "3602" = "botany",
    "3603" = "molecular biology",
    "3604" = "ecology",
    "3605" = "genetics",
    "3606" = "microbiology",
    "3607" = "pharmacology",
    "3608" = "physiology",
    "3609" = "zoology",
    "3611" = "neuroscience",
    "3699" = "miscellaneous biology",
    "3700" = "mathematics",
    "3701" = "applied mathematics",
    "3702" = "statistics and decision science",
    "3801" = "military technologies",
    "4000" = "multi/interdisciplinary studies",
    "4001" = "intercultural and international studies",
    "4002" = "nutrition sciences",
    "4005" = "mathematics and computer science",
    "4006" = "cognitive science and biopsychology",
    "4007" = "interdisciplinary social sciences",
    "4101" = "physical fitness parks recreation and leisure",
    "4801" = "philosophy and religious studies",
    "4901" = "theology and religious vocations",
    "5000" = "physical sciences",
    "5001" = "astronomy and astrophysics",
    "5002" = "atmospheric sciences and meteorology",
    "5003" = "chemistry",
    "5004" = "geology and earth science",
    "5005" = "geosciences",
    "5006" = "oceanography",
    "5007" = "physics",
    "5008" = "material science",
    "5098" = "multi-disciplinary or general science",
    "5102" = "nuclear, industrial radiology, and biological technologies",
    "5200" = "psychology",
    "5201" = "educational psychology",
    "5202" = "clinical psychology",
    "5203" = "counseling psychology",
    "5205" = "industrial and organizational psychology",
    "5206" = "social psychology",
    "5299" = "miscellaneous psychology",
    "5301" = "criminal justice and fire protection",
    "5401" = "public administration",
    "5402" = "public policy",
    "5403" = "human services and community organization",
    "5404" = "social work",
    "5500" = "general social sciences",
    "5501" = "economics",
    "5502" = "anthropology and archeology",
    "5503" = "criminology",
    "5504" = "geography",
    "5505" = "international relations",
    "5506" = "political science and government",
    "5507" = "sociology",
    "5599" = "miscellaneous social sciences",
    "5601" = "construction services",
    "5701" = "electrical, mechanical and precision technologies and production",
    "5901" = "transportation sciences and technologies",
    "6000" = "fine arts",
    "6001" = "drama and theater arts",
    "6002" = "music",
    "6003" = "visual and performing arts",
    "6004" = "commercial arts and graphic design",
    "6005" = "film video and photographic arts",
    "6006" = "art history and criticism",
    "6007" = "studio arts",
    "6099" = "miscellaneous fine arts",
    "6100" = "general medical and health services",
    "6102" = "communication disorders sciences and services",
    "6103" = "health and medical administrative services",
    "6104" = "medical assisting services",
    "6105" = "medical technologies technicians",
    "6106" = "health and medical preparatory programs",
    "6107" = "nursing",
    "6108" = "pharmacy pharmaceutical sciences and administration",
    "6109" = "treatment therapy professions",
    "6110" = "community and public health",
    "6199" = "miscellaneous health medical professions",
    "6200" = "general business",
    "6201" = "accounting",
    "6202" = "actuarial science",
    "6203" = "business management and administration",
    "6204" = "operations logistics and e-commerce",
    "6205" = "business economics",
    "6206" = "marketing and marketing research",
    "6207" = "finance",
    "6209" = "human resources and personnel management",
    "6210" = "international business",
    "6211" = "hospitality management",
    "6212" = "management information systems and statistics",
    "6299" = "miscellaneous business and medical administration",
    "6402" = "history",
    "6403" = "united states history"
  )
  # Index by the code's text form so 2102 and "2102" hit the same entry;
  # anything without an entry comes back as NA and is mapped to "other".
  name <- unname(major.names[as.character(code)])
  name[is.na(name)] <- "other"
  name
}
# Replace major codes with major names.  vapply() calls codename() on one code
# at a time and, unlike a 1:nrow() loop, does nothing on an empty table.
ms14cittop$major <- vapply(
  ms14cittop$major, codename, character(1),
  USE.NAMES = FALSE
)
ms14ncittop$major <- vapply(
  ms14ncittop$major, codename, character(1),
  USE.NAMES = FALSE
)

# Replace the stem flag with a label: 1 becomes "stem", 0 becomes "non-stem".
# %in% keeps a missing flag missing, where a logical-mask assignment on the
# data frame would stop with an error.
label.stem <- function(flag) {
  out <- as.character(flag)
  out[flag %in% 1] <- "stem"
  out[flag %in% 0] <- "non-stem"
  out
}
ms14cittop$stem <- label.stem(ms14cittop$stem)
ms14ncittop$stem <- label.stem(ms14ncittop$stem)
# ms14cittop, ms14ncittop will be used later.
# STEM vs occupation type: split each population on the STEM flag.
stemwork14cit <- pus14cit %>% filter(STEM == 1)
stemwork14ncit <- pus14ncit %>% filter(STEM == 1)
nstemwork14cit <- pus14cit %>% filter(STEM == 0)
nstemwork14ncit <- pus14ncit %>% filter(STEM == 0)

# Count rows per OCCP.TYPE, most frequent first, and tag them with `label`.
# Columns are frequency, stem, work (in that order; later steps address them
# by position).  The frame is built explicitly because as.data.frame() on a
# sorted table returns Var1/Freq columns on newer R versions, which breaks
# renaming the columns and reading the types from the row names.
count.work <- function(pus, label) {
  freq <- sort(table(pus$OCCP.TYPE), decreasing = TRUE)
  data.frame(
    frequency = as.vector(freq),
    stem = rep(label, length(freq)),
    work = as.character(names(freq)),
    stringsAsFactors = FALSE
  )
}
topstemwork14cit <- count.work(stemwork14cit, "stem")
topstemwork14ncit <- count.work(stemwork14ncit, "stem")
topnstemwork14cit <- count.work(nstemwork14cit, "non-stem")
topnstemwork14ncit <- count.work(nstemwork14ncit, "non-stem")

# Stack STEM and non-STEM counts, show the placeholder type "0" as "NONE",
# and add each row's share of the stacked total.
topwork14cit <- rbind(topstemwork14cit, topnstemwork14cit)
topwork14ncit <- rbind(topstemwork14ncit, topnstemwork14ncit)
topwork14cit$work[topwork14cit$work == "0"] <- "NONE"
topwork14ncit$work[topwork14ncit$work == "0"] <- "NONE"
topwork14cit$proportion <- topwork14cit$frequency / sum(topwork14cit$frequency)
topwork14ncit$proportion <-
  topwork14ncit$frequency / sum(topwork14ncit$frequency)
# topwork14cit,topwork14ncit will be used.
# Build the Sankey edge tables.  Columns 2-4 of the major table and of the
# occupation table are renamed to origin/visit/weight and stacked, giving
# major -> stem label edges followed by stem label -> occupation type edges.
edge.cols <- c("origin", "visit", "weight")

# Citizens.
msw14cit1 <- setNames(ms14cittop[, 2:4], edge.cols)
msw14cit2 <- setNames(topwork14cit[, 2:4], edge.cols)
msw14cit <- rbind(msw14cit1, msw14cit2)
msw14cit$origin <- as.factor(msw14cit$origin)
msw14cit$visit <- as.factor(msw14cit$visit)
msw14cit$weight <- as.numeric(msw14cit$weight)

# Non-citizens.
msw14ncit1 <- setNames(ms14ncittop[, 2:4], edge.cols)
msw14ncit2 <- setNames(topwork14ncit[, 2:4], edge.cols)
msw14ncit <- rbind(msw14ncit1, msw14ncit2)
msw14ncit$origin <- as.factor(msw14ncit$origin)
msw14ncit$visit <- as.factor(msw14ncit$visit)
msw14ncit$weight <- as.numeric(msw14ncit$weight)
library(googleVis)
package ‘googleVis’ was built under R version 3.2.5Creating a generic function for ‘toJSON’ from package ‘jsonlite’ in package ‘googleVis’
Welcome to googleVis version 0.6.1
Please read the Google API Terms of Use
before you start using the package:
https://developers.google.com/terms/
Note, the plot method of googleVis will by default use
the standard browser to display its output.
See the googleVis package vignettes for more details,
or visit http://github.com/mages/googleVis.
To suppress this message use:
suppressPackageStartupMessages(library(googleVis))
# Draw the two Sankey diagrams (citizens, then non-citizens). They differ only
# in the input edge list and in the link fill colour.
plot(gvisSankey(
  msw14cit,
  from = "origin", to = "visit", weight = "weight",
  options = list(
    height = 700, width = 1500,
    sankey = "{link:{color:{fill:'deepskyblue'}},node:{width:20,label:{fontSize:12,bold:true}}}"
  )
))
plot(gvisSankey(
  msw14ncit,
  from = "origin", to = "visit", weight = "weight",
  options = list(
    height = 700, width = 1500,
    sankey = "{link:{color:{fill:'mediumspringgreen'}},node:{width:20,label:{fontSize:12,bold:true}}}"
  )
))
The Sankey plots are now finished.
Next, I will draw the circular plots.
# Inputs for the circular (chord) plots: for citizens / non-citizens and
# STEM / non-STEM majors, a 10 x 25 matrix counting how many people of each of
# the ten top majors work in each occupation field.
#
# Fix relative to the earlier version: every major is now counted against ONE
# shared, sorted list of occupation types. Previously each major's own sorted
# table was sliced with [2:26], so an occupation type that was absent for one
# major shifted that row's columns (and produced NAs), while the column names
# came from the first major only. Counting through an explicit factor also
# avoids depending on the R-version-specific layout of
# as.data.frame(sort(table(x))).
pus14citstem <- pus14cit[pus14cit$STEM == 1, ]
pus14citnstem <- pus14cit[pus14cit$STEM == 0, ]
pus14ncitstem <- pus14ncit[pus14ncit$STEM == 1, ]
pus14ncitnstem <- pus14ncit[pus14ncit$STEM == 0, ]

n_top_majors <- 10
# All occupation types seen in either population, in sorted order.
occ_levels <- sort(unique(c(as.character(pus14cit$OCCP.TYPE),
                            as.character(pus14ncit$OCCP.TYPE))))
# As before, the first sorted type is skipped and the next 25 are kept.
# NOTE(review): this presumes the first type is "0" (no occupation) and that
# there are 26 types in total - confirm against the OCCP.TYPE coding.
occ_keep <- 2:26

# each top major
# Rows of `people` whose major (FOD1P) equals each of the first ten `majors`.
split_by_top_major <- function(people, majors) {
  lapply(seq_len(n_top_majors), function(i) {
    # as.character() first, so a factor-coded major yields its label, not its
    # internal level index.
    major_code <- as.numeric(as.character(majors[i]))
    people %>% filter(FOD1P == major_code)
  })
}
pus14citstemtop <- split_by_top_major(pus14citstem, ms14cittops$major)
pus14citnstemtop <- split_by_top_major(pus14citnstem, ms14cittopns$major)
pus14ncitstemtop <- split_by_top_major(pus14ncitstem, ms14ncittops$major)
pus14ncitnstemtop <- split_by_top_major(pus14ncitnstem, ms14ncittopns$major)

# amount of jobs in each field for each top major
# Data frame (frequency, type) with one row per entry of occ_levels, in that
# order; types with no people get a count of 0.
count_occupations <- function(people) {
  tab <- table(factor(as.character(people$OCCP.TYPE), levels = occ_levels))
  data.frame(frequency = as.vector(tab), type = names(tab),
             stringsAsFactors = FALSE)
}
citstemf14 <- lapply(pus14citstemtop, count_occupations)
citnstemf14 <- lapply(pus14citnstemtop, count_occupations)
ncitstemf14 <- lapply(pus14ncitstemtop, count_occupations)
ncitnstemf14 <- lapply(pus14ncitnstemtop, count_occupations)

# Major x occupation-field count matrix; rows are labelled with codename() of
# the major code, columns with the occupation type.
occupation_matrix <- function(count_tables, majors) {
  m <- matrix(0, nrow = n_top_majors, ncol = length(occ_keep))
  for (i in seq_len(n_top_majors)) {
    m[i, ] <- count_tables[[i]]$frequency[occ_keep]
  }
  colnames(m) <- count_tables[[1]]$type[occ_keep]
  # Row names must exist before they can be replaced one at a time.
  rownames(m) <- seq_len(n_top_majors)
  for (i in seq_len(n_top_majors)) {
    rownames(m)[i] <- codename(majors[i])
  }
  m
}
citstemm14 <- occupation_matrix(citstemf14, ms14cittops$major)
citnstemm14 <- occupation_matrix(citnstemf14, ms14cittopns$major)
ncitstemm14 <- occupation_matrix(ncitstemf14, ms14ncittops$major)
ncitnstemm14 <- occupation_matrix(ncitnstemf14, ms14ncittopns$major)
visualization #1 Citizen STEM
library(recharts)
# Chord diagram #1: top-10 STEM majors of citizens vs. occupation fields.
zero1010 <- matrix(0, 10, 10)
zero2525 <- matrix(0, 25, 25)
# 35 x 35 block matrix [0 M; t(M) 0]: majors link only to occupation fields.
citstemm14c1 <- cbind(zero1010, citstemm14)
citstemm14c2 <- cbind(t(citstemm14), zero2525)
citstemm14c <- rbind(citstemm14c1, citstemm14c2)
chordcitstem <- list(
  title = list(text = 'Citizen STEM', subtext = 'From JavaScript', x = 'right', y = 'bottom'),
  # Tooltip callback: edge weight for links, node name for nodes.
  tooltip = list(
    trigger = 'item',
    formatter = JS(paste(
      "function(params){",
      "if (params.indicator2) { // is edge",
      "return params.value.weight;",
      "} else {// is node",
      "return params.name",
      "}",
      "}",
      sep = "\n"
    ))
  ),
  toolbox = list(
    show = TRUE,
    feature = list(
      restore = list(show = TRUE),
      magicType = list(show = TRUE, type = c('force','chord')),
      saveAsImage = list(show = TRUE)
    )
  ),
  legend = list(x = 'left', data = c(rownames(citstemm14), colnames(citstemm14))),
  series = list(list(
    type = 'chord', showScale = FALSE, showScaleText = FALSE,
    # One node per major (10 rows) followed by one per occupation field (25 columns).
    data = lapply(
      c(rownames(citstemm14)[1:10], colnames(citstemm14)[1:25]),
      function(node) list(name = node)
    ),
    itemStyle = list(normal = list(label = list(show = TRUE))),
    matrix = citstemm14c
  ))
)
echart(chordcitstem)
visualization #2 Citizen Non-STEM
# Chord diagram #2: top-10 non-STEM majors of citizens vs. occupation fields.
zero1010 <- matrix(0, 10, 10)
zero2525 <- matrix(0, 25, 25)
# 35 x 35 block matrix [0 M; t(M) 0]: majors link only to occupation fields.
citnstemm14c1 <- cbind(zero1010, citnstemm14)
citnstemm14c2 <- cbind(t(citnstemm14), zero2525)
citnstemm14c <- rbind(citnstemm14c1, citnstemm14c2)
chordcitnstem <- list(
  title = list(text = 'Citizen non-STEM', subtext = 'From JavaScript', x = 'right', y = 'bottom'),
  # Tooltip callback: edge weight for links, node name for nodes.
  tooltip = list(
    trigger = 'item',
    formatter = JS(paste(
      "function(params){",
      "if (params.indicator2) { // is edge",
      "return params.value.weight;",
      "} else {// is node",
      "return params.name",
      "}",
      "}",
      sep = "\n"
    ))
  ),
  toolbox = list(
    show = TRUE,
    feature = list(
      restore = list(show = TRUE),
      magicType = list(show = TRUE, type = c('force','chord')),
      saveAsImage = list(show = TRUE)
    )
  ),
  legend = list(x = 'left', data = c(rownames(citnstemm14), colnames(citnstemm14))),
  series = list(list(
    type = 'chord', showScale = FALSE, showScaleText = FALSE,
    # One node per major (10 rows) followed by one per occupation field (25 columns).
    data = lapply(
      c(rownames(citnstemm14)[1:10], colnames(citnstemm14)[1:25]),
      function(node) list(name = node)
    ),
    itemStyle = list(normal = list(label = list(show = TRUE))),
    matrix = citnstemm14c
  ))
)
echart(chordcitnstem)
visualization #3 Non-Citizen STEM
# Chord diagram #3: top-10 STEM majors of non-citizens vs. occupation fields.
zero1010 <- matrix(0, 10, 10)
zero2525 <- matrix(0, 25, 25)
# 35 x 35 block matrix [0 M; t(M) 0]: majors link only to occupation fields.
ncitstemm14c1 <- cbind(zero1010, ncitstemm14)
ncitstemm14c2 <- cbind(t(ncitstemm14), zero2525)
ncitstemm14c <- rbind(ncitstemm14c1, ncitstemm14c2)
chordncitstem <- list(
  title = list(text = 'non-Citizen STEM', subtext = 'From JavaScript', x = 'right', y = 'bottom'),
  # Tooltip callback: edge weight for links, node name for nodes.
  tooltip = list(
    trigger = 'item',
    formatter = JS(paste(
      "function(params){",
      "if (params.indicator2) { // is edge",
      "return params.value.weight;",
      "} else {// is node",
      "return params.name",
      "}",
      "}",
      sep = "\n"
    ))
  ),
  toolbox = list(
    show = TRUE,
    feature = list(
      restore = list(show = TRUE),
      magicType = list(show = TRUE, type = c('force','chord')),
      saveAsImage = list(show = TRUE)
    )
  ),
  legend = list(x = 'left', data = c(rownames(ncitstemm14), colnames(ncitstemm14))),
  series = list(list(
    type = 'chord', showScale = FALSE, showScaleText = FALSE,
    # One node per major (10 rows) followed by one per occupation field (25 columns).
    data = lapply(
      c(rownames(ncitstemm14)[1:10], colnames(ncitstemm14)[1:25]),
      function(node) list(name = node)
    ),
    itemStyle = list(normal = list(label = list(show = TRUE))),
    matrix = ncitstemm14c
  ))
)
echart(chordncitstem)
visualization #4 Non-Citizen Non-STEM
# Chord diagram #4: top-10 non-STEM majors of non-citizens vs. occupation fields.
zero1010 <- matrix(0, 10, 10)
zero2525 <- matrix(0, 25, 25)
# 35 x 35 block matrix [0 M; t(M) 0]: majors link only to occupation fields.
ncitnstemm14c1 <- cbind(zero1010, ncitnstemm14)
ncitnstemm14c2 <- cbind(t(ncitnstemm14), zero2525)
ncitnstemm14c <- rbind(ncitnstemm14c1, ncitnstemm14c2)
chordncitnstem <- list(
  title = list(text = 'non-Citizen non-STEM', subtext = 'From JavaScript', x = 'right', y = 'bottom'),
  # Tooltip callback: edge weight for links, node name for nodes.
  tooltip = list(
    trigger = 'item',
    formatter = JS(paste(
      "function(params){",
      "if (params.indicator2) { // is edge",
      "return params.value.weight;",
      "} else {// is node",
      "return params.name",
      "}",
      "}",
      sep = "\n"
    ))
  ),
  toolbox = list(
    show = TRUE,
    feature = list(
      restore = list(show = TRUE),
      magicType = list(show = TRUE, type = c('force','chord')),
      saveAsImage = list(show = TRUE)
    )
  ),
  legend = list(x = 'left', data = c(rownames(ncitnstemm14), colnames(ncitnstemm14))),
  series = list(list(
    type = 'chord', showScale = FALSE, showScaleText = FALSE,
    # One node per major (10 rows) followed by one per occupation field (25 columns).
    data = lapply(
      c(rownames(ncitnstemm14)[1:10], colnames(ncitnstemm14)[1:25]),
      function(node) list(name = node)
    ),
    itemStyle = list(normal = list(label = list(show = TRUE))),
    matrix = ncitnstemm14c
  ))
)
echart(chordncitnstem)
From our first bar plot, we can see that over the most recent 4 years the employment rate of each category (citizens with STEM degrees, citizens with non-STEM degrees, non-citizens with STEM degrees, non-citizens with non-STEM degrees) shows almost no volatility. The employment rate is around 85% for citizens with STEM degrees, around 80% for citizens with non-STEM degrees, 81% for non-citizens with STEM degrees, and 68% for non-citizens with non-STEM degrees. Looking at these figures, we can say that graduates with STEM degrees have higher employment rates than graduates with non-STEM degrees, especially among non-citizens. One important reason may be that, because the STEM policy extends OPT, international students with STEM degrees have enough time to stay legally in the United States after graduation.
From our second bar plot, we can see that for citizens, whether they hold STEM or non-STEM degrees, the level of degree has little effect on the employment rate. However, for non-citizens (especially those with non-STEM degrees), the level of degree does affect the employment rate. First, we can say that, because of the STEM policy, the employment rates of the different degree levels differ much less for non-citizens with STEM degrees than they do for non-citizens with non-STEM degrees. Second, we can say that, because of the STEM policy, the employment rate for non-citizens with a STEM doctorate degree is almost the same as the employment rate for citizens with a STEM doctorate degree.
From the sankey plots, we can find that the most popular STEM major for citizens is biology while for non-citizens is computer science and the most popular non-STEM major for both citizens and noncitizens is business management and administration. For STEM major citizens, after graduation, the top four fields are MGR MED CMM ENG, and for STEM major non-citizens, after graduation the top four fields are MGR SCI CMM ENG. For both non-STEM major citizens and non-STEM major non-citizens, after graduation, the top five fields are EDU OFF SAL MGR MED.
When we do the circular plot, we originally want to use circlize package in R, but we find that the plots from this package are not dynamic. Then, we are thinking whether we can do dynamic circular plot without directly using JavaScript. After searching, we find an interesting package recharts, which can be used to draw dynamic circular plots. Thanks to this package, we can derive the major vs working field mappings.
One problem we meet is that there is no STEM/non-STEM classification for each major. Therefore, we go to the official website for STEM/non-STEM classifications and finally decide the classification for each major.
The other problem is deciding which variables to use in our analysis. For example, which variables reflect employment status? We decided to study the employment status of the current year, so we use the variable OCCP.
In this part of the analysis, we want to know how STEM jobs have changed since the announcement of the STEM policy. Two data sets are used: the data of 2007, which is the last year before the STEM policy, and the data of 2014, the latest obtainable data under the STEM policy. For each data set, we categorize our data into two classes: STEM-job data and non-STEM-job data. With this separation, we can look at the differences between STEM and non-STEM jobs before and after the STEM policy was announced.
Will the STEM/non-STEM job allocation change?
Among STEM/non-STEM jobs, will gender proportion and class of work be influenced by the STEM policy?
How did the working conditions, such as wage and working hours change?
Will the STEM policy further influence people’s immigration behavior?
library(data.table)
library(dplyr)
library(ggplot2)
library(fmsb)
library(choroplethr)
library(choroplethrMaps)
# read data
# Columns selected by name from every PUMS person file in this part of the analysis.
cols <- c("WAGP", "ST", "AGEP", "ESR", "CIT", "YOEP", "COW", "SEX" , "WKHP","WKW","SOCP","POWSP","POVPIP","SCHL")
# 2007 person file, part a: the analysis columns listed in `cols`.
before_data11<-fread("/Users/YaqingXie/Desktop/Applied Data Science/Week1/csv_pus(2007)/ss07pusa.csv", select=cols)
Read 0.0% of 1549929 rows
Bumped column 106 to type character on data row 17, field contains '37201X'. Coercing previously read values in this column from logical, integer or numeric back to character which may not be lossless; e.g., if '00' and '000' occurred before they will now be just '0', and there may be inconsistencies with treatment of ',,' and ',NA,' too (if they occurred in this column before the bump). If this matters please rerun and set 'colClasses' to 'character' for this column. Please note that column type detection uses the first 5 rows, the middle 5 rows and the last 5 rows, so hopefully this message should be very rare. If reporting to datatable-help, please rerun and include the output from verbose=TRUE.
Read 21.9% of 1549929 rows
Read 44.5% of 1549929 rows
Read 66.5% of 1549929 rows
Read 87.7% of 1549929 rows
Read 1549929 rows and 14 (of 239) columns from 1.112 GB file in 00:00:08
# 2007 part a again, this time columns 160-239 selected by position (the last
# 80 columns of the file). Presumably the replicate weights used by the
# commented-out svrepdesign() call further down - TODO confirm.
before_data12<-fread("/Users/YaqingXie/Desktop/Applied Data Science/Week1/csv_pus(2007)/ss07pusa.csv", select=c(160:239))
Read 0.0% of 1549929 rows
Read 17.4% of 1549929 rows
Read 34.8% of 1549929 rows
Read 51.6% of 1549929 rows
Read 69.7% of 1549929 rows
Read 87.1% of 1549929 rows
Read 1549929 rows and 80 (of 239) columns from 1.112 GB file in 00:00:08
# 2007 person file, part b: the analysis columns listed in `cols`.
before_data21<-fread("/Users/YaqingXie/Desktop/Applied Data Science/Week1/csv_pus(2007)/ss07pusb.csv", select=cols)
Read 0.0% of 1444733 rows
Read 24.9% of 1444733 rows
Read 49.8% of 1444733 rows
Read 74.1% of 1444733 rows
Read 99.7% of 1444733 rows
Read 1444733 rows and 14 (of 239) columns from 1.037 GB file in 00:00:07
# 2007 part b again, columns 160-239 by position (the last 80 columns;
# presumably the replicate weights - TODO confirm).
before_data22<-fread("/Users/YaqingXie/Desktop/Applied Data Science/Week1/csv_pus(2007)/ss07pusb.csv", select=c(160:239))
Read 2.1% of 1444733 rows
Read 21.5% of 1444733 rows
Read 40.1% of 1444733 rows
Read 58.8% of 1444733 rows
Read 77.5% of 1444733 rows
Read 96.2% of 1444733 rows
Read 1444733 rows and 80 (of 239) columns from 1.037 GB file in 00:00:08
# Assemble the 2007 table. For each file part, the analysis columns and the
# trailing 80 columns are placed side by side (both come from the same file,
# so the rows line up), then the two parts are stacked.
before_data1 <- cbind(before_data11, before_data12)
before_data2 <- cbind(before_data21, before_data22)
# The former subset() wrapper had no condition: it selected every row and
# column and only made an extra copy of the whole table, so it is dropped.
before_data <- rbind(before_data1, before_data2)
# 2014 person file, part a: the analysis columns listed in `cols`.
after_data11<-fread("/Users/YaqingXie/Desktop/Applied Data Science/Week1/csv_pus/ss14pusa.csv", select=cols)
Read 0.0% of 1611956 rows
Bumped column 127 to type character on data row 7, field contains '37201X'. Coercing previously read values in this column from logical, integer or numeric back to character which may not be lossless; e.g., if '00' and '000' occurred before they will now be just '0', and there may be inconsistencies with treatment of ',,' and ',NA,' too (if they occurred in this column before the bump). If this matters please rerun and set 'colClasses' to 'character' for this column. Please note that column type detection uses the first 5 rows, the middle 5 rows and the last 5 rows, so hopefully this message should be very rare. If reporting to datatable-help, please rerun and include the output from verbose=TRUE.
Read 17.4% of 1611956 rows
Read 35.4% of 1611956 rows
Read 54.0% of 1611956 rows
Read 72.6% of 1611956 rows
Read 91.2% of 1611956 rows
Read 1611956 rows and 14 (of 284) columns from 1.427 GB file in 00:00:09
# 2014 part a again, columns 205-284 by position (the last 80 columns of the
# file; presumably the replicate weights - TODO confirm).
after_data12<-fread("/Users/YaqingXie/Desktop/Applied Data Science/Week1/csv_pus/ss14pusa.csv", select=c(205:284))
Read 0.0% of 1611956 rows
Read 14.3% of 1611956 rows
Read 27.9% of 1611956 rows
Read 41.6% of 1611956 rows
Read 55.2% of 1611956 rows
Read 68.9% of 1611956 rows
Read 82.5% of 1611956 rows
Read 96.2% of 1611956 rows
Read 1611956 rows and 80 (of 284) columns from 1.427 GB file in 00:00:11
# 2014 person file, part b: the analysis columns listed in `cols`.
after_data21<-fread("/Users/YaqingXie/Desktop/Applied Data Science/Week1/csv_pus/ss14pusb.csv", select=cols)
Read 0.0% of 1520654 rows
Read 19.7% of 1520654 rows
Read 38.8% of 1520654 rows
Read 57.9% of 1520654 rows
Read 76.9% of 1520654 rows
Read 96.0% of 1520654 rows
Read 1520654 rows and 14 (of 284) columns from 1.346 GB file in 00:00:09
# 2014 part b again, columns 205-284 by position (the last 80 columns;
# presumably the replicate weights - TODO confirm).
after_data22<-fread("/Users/YaqingXie/Desktop/Applied Data Science/Week1/csv_pus/ss14pusb.csv", select=c(205:284))
Read 0.0% of 1520654 rows
Read 15.1% of 1520654 rows
Read 29.6% of 1520654 rows
Read 44.1% of 1520654 rows
Read 58.5% of 1520654 rows
Read 73.7% of 1520654 rows
Read 88.1% of 1520654 rows
Read 1520654 rows and 80 (of 284) columns from 1.346 GB file in 00:00:10
# Assemble the 2014 table. For each file part, the analysis columns and the
# trailing 80 columns are placed side by side (both come from the same file,
# so the rows line up), then the two parts are stacked.
after_data1 <- cbind(after_data11, after_data12)
after_data2 <- cbind(after_data21, after_data22)
# The former subset() wrapper had no condition: it selected every row and
# column and only made an extra copy of the whole table, so it is dropped.
after_data <- rbind(after_data1, after_data2)
# Filter data: keep only people currently employed in the US.
# Rows whose ESR is 3, 6 or missing are dropped, and POWSP must lie in 1..56.
# NOTE(review): per the PUMS coding, ESR 3/6 should be "unemployed" / "not in
# labor force" and POWSP 1-56 the US state codes - verify against the data
# dictionary. Rows with a missing POWSP are dropped as well, because
# data.table treats an NA condition as FALSE.
before_data <- before_data[!(ESR %in% c(3, 6, NA)) & POWSP >= 1 & POWSP <= 56]
after_data <- after_data[!(ESR %in% c(3, 6, NA)) & POWSP >= 1 & POWSP <= 56]
# SOC job codes treated as STEM occupations, one list per survey year.
# source: http://www.bls.gov/soc/Attachment_C_STEM.pdf
# NOTE(review): matching below is exact; the SOCP column also contains codes
# ending in "X" (e.g. '37201X' in the read log), so check that the lists cover
# the codes as they actually appear in each year's file.
before_stem_job_codes <- c('113021','119041','119111','119121','151021','151030','151041','151061','151071','151081','1510XX','152011','152031','1520XX','171010','171020','172011','172041','172051','172061','172070','172081','1720XX','172110','172121','172131','172141','1721XX','1721YY','173010','173020','173031','191010','191020','191030','191040','192010','192021','192030','192040','192099','193011','193020','193030','193051','1930XX','194011','194021','194031','194041','1940XX','254010','291011','291020','291031','291041','291051','291060','291071','291081','291111','291121','291122','291123','291124','291125','291126','291127','291129','291131','291199','292010','292021','292030','292041','292050','292061','292071','292081','292090','299000','414010','419031')
after_stem_job_codes <- c('113021','119041','119121','151111','151121','151122','151131','151132','151133','151134','151141','151142','151143','151151','151152','151199','152011','152021','152031','152041','152099','171021','171022','172011','172021','172031','172041','172051','172061','172071','172072','172081','172111','172112','172121','172131','172141','172151','172161','172171','172199','173012','173013','173019','173021','173022','173023','173024','173025','173026','173027','173029','173031','191011','191012','191012','191021','191022','191023','191029','191031','191032','191041','191042','191099','192011','192012','192021','192031','192032','192041','192042','192043','192099','194011','194021','194031','194041','194051','194091','194092','194093','251021','251022','251032','251041','251042','251043','251051','251052','251053','251054','414011','419031')

# Categorize each job as STEM or non-STEM.
# Codes found in `stem_codes` become "STEM"; every other non-empty code becomes
# "NON-STEM"; empty strings are left untouched.
label_stem_jobs <- function(socp, stem_codes) {
  is_stem <- socp %in% stem_codes
  socp[!is_stem & (socp != "")] <- "NON-STEM"
  socp[is_stem] <- "STEM"
  socp
}
before_data$SOCP <- label_stem_jobs(before_data$SOCP, before_stem_job_codes)
after_data$SOCP <- label_stem_jobs(after_data$SOCP, after_stem_job_codes)
# Replace the numeric SEX and COW codes with readable labels.
# Code k (k = 1, 2, ...) becomes labels[k]. Any other value is kept, converted
# to text because the column becomes character; missing values stay missing.
relabel_codes <- function(codes, labels) {
  relabelled <- as.character(codes)
  position <- match(codes, seq_along(labels))
  known <- !is.na(position)
  relabelled[known] <- labels[position[known]]
  relabelled
}

# rename gender
sex_labels <- c("Male", "Female")
before_data$SEX <- relabel_codes(before_data$SEX, sex_labels)
after_data$SEX <- relabel_codes(after_data$SEX, sex_labels)

# rename class of worker
cow_labels <- c(
  "For-profit Company",
  "Not-for-profit Organization",
  "Local Government",
  "State Government",
  "Federal Government",
  "Self-employeed & Not Incorporated",
  "Self-employeed & Incorporated",
  "Family Business"
)
before_data$COW <- relabel_codes(before_data$COW, cow_labels)
after_data$COW <- relabel_codes(after_data$COW, cow_labels)
# Recode number of weeks worked in the past year (2014 data only): the
# categorical codes 1-6 are replaced by one representative number of weeks each.
# Other values, including missing ones, are left as they are.
# NOTE(review): the 2007 WKW column is not recoded - presumably it already
# holds a number of weeks; confirm against the 2007 data dictionary.
wkw_weeks <- c(51, 48.5, 43.5, 33, 20, 7)
wkw_values <- as.numeric(after_data$WKW)
wkw_coded <- wkw_values %in% seq_along(wkw_weeks)
wkw_values[wkw_coded] <- wkw_weeks[wkw_values[wkw_coded]]
after_data$WKW <- wkw_values
# plotting
# 1 POWSP-stem/nonstem-before/after MAP
# Split each year into STEM and non-STEM workers and keep the two state
# columns: ST and POWSP.
before_stem <- before_data[SOCP == "STEM"]
before_nonstem <- before_data[SOCP == "NON-STEM"]
after_stem <- after_data[SOCP == "STEM"]
after_nonstem <- after_data[SOCP == "NON-STEM"]

before_state_stem <- before_stem[, c("ST", "POWSP"), with = FALSE]
before_state_nonstem <- before_nonstem[, c("ST", "POWSP"), with = FALSE]
after_state_stem <- after_stem[, c("ST", "POWSP"), with = FALSE]
after_state_nonstem <- after_nonstem[, c("ST", "POWSP"), with = FALSE]

# Lookup table from state code to region name, built from columns 1 and 3 of
# choroplethr's state.regions. cbind() turns both columns into text.
data("state.regions")
statenames <- setNames(
  data.frame(cbind(state.regions[, 1], state.regions[, 3])),
  c('region', 'region_code')
)
# Count how often each state occurs in a vector of state codes.
#   vector: state codes, matched against statenames$region_code.
# Returns a data frame with columns `region` (state name) and `value` (number
# of occurrences). Codes that have no match in `statenames` are dropped.
getstate <- function(vector){
  codes <- data.frame(region_code = vector)
  matched <- merge(codes, statenames, by = "region_code")
  counts <- data.frame(table(matched$region))
  names(counts) <- c('region','value')
  counts
}
# Per-state counts for each year and job group, once by the ST column
# (*_ST) and once by the POWSP column (*_POW).
before_stem_ST     <- getstate(before_state_stem[["ST"]])
before_stem_POW    <- getstate(before_state_stem[["POWSP"]])
before_nonstem_ST  <- getstate(before_state_nonstem[["ST"]])
before_nonstem_POW <- getstate(before_state_nonstem[["POWSP"]])
after_stem_ST      <- getstate(after_state_stem[["ST"]])
after_stem_POW     <- getstate(after_state_stem[["POWSP"]])
after_nonstem_ST   <- getstate(after_state_nonstem[["ST"]])
after_nonstem_POW  <- getstate(after_state_nonstem[["POWSP"]])
# Map of 2007 non-STEM job counts by POWSP state, with a single colour level.
state_choropleth(before_nonstem_POW, num_colors = 1,
                 title = "Before STEM Policy: NON-STEM Job Allocation",
                 legend = "Number of Occupations")
joining factor and character vector, coercing into character vector
# Map of 2014 non-STEM job counts by POWSP state, with a single colour level.
state_choropleth(after_nonstem_POW, num_colors = 1,
                 title = "After STEM Policy: NON-STEM Job Allocation",
                 legend = "Number of Occupations")
joining factor and character vector, coercing into character vector
# Map of 2007 STEM job counts by POWSP state, with a single colour level.
state_choropleth(before_stem_POW, num_colors = 1,
                 title = "Before STEM Policy: STEM Job Allocation",
                 legend = "Number of Occupations")
joining factor and character vector, coercing into character vector
# Map of 2014 STEM job counts by POWSP state, with a single colour level.
state_choropleth(after_stem_POW, num_colors = 1,
                 title = "After STEM Policy: STEM Job Allocation",
                 legend = "Number of Occupations")
joining factor and character vector, coercing into character vector
According to these two figures, we can see that the geographical allocation of non-stem jobs barely changed after STEM policy went out.
However, if we take a closer look at STEM jobs, we see that the relevant jobs in NY and FL tended to diffuse towards neighboring states.
# Same 2007 STEM map, zoomed in on eight eastern states.
state_choropleth(before_stem_POW, num_colors = 1,
                 zoom = c("new york","pennsylvania","virginia","ohio","north carolina","south carolina","georgia","florida"),
                 title = "Before STEM Policy: STEM Job Allocation",
                 legend = "Number of Occupations")
joining factor and character vector, coercing into character vector
# Same 2014 STEM map, zoomed in on eight eastern states.
state_choropleth(after_stem_POW, num_colors = 1,
                 zoom = c("new york","pennsylvania","virginia","ohio","north carolina","south carolina","georgia","florida"),
                 title = "After STEM Policy: STEM Job Allocation",
                 legend = "Number of Occupations")
joining factor and character vector, coercing into character vector
Among these states, VA has the most significant increase in the amount of STEM jobs.
In general, the state allocation of jobs didn’t change dramatically after the STEM policy went out. It could be plausible since STEM policy is in effect for foreign students while the job allocation depends more on the locational choices of different companies.
# 2 wage&wkhp&wkw&povpip&schl-stem/nonstem-before/after
# survey weight
library(survey)
package ‘survey’ was built under R version 3.2.5Loading required package: grid
Loading required package: Matrix
Loading required package: survival
Attaching package: ‘survey’
The following object is masked from ‘package:graphics’:
dotchart
#df_before<-svrepdesign(variables=before_data[,1:16],
# repweights=before_data[,17:86], type="BRR",combined.weights=TRUE,
# weights=before_data$PWGTP)
#summary(df_before)
#svymean(~ WAGP,df_before, na.rm = T)
#df_after<-svrepdesign(variables=after_data[,1:16],
# repweights=after_data[,17:86], type="BRR",combined.weights=TRUE,
# weights=after_data$PWGTP)
#summary(df_after)
#svymean(~ WAGP,df_after, na.rm = T)
# Radar chart comparing average working conditions of the four groups
# (STEM / non-STEM jobs, before / after the policy).

# Summarise one group into a single row: mean wage, weekly hours, weeks worked
# per year, income-to-poverty ratio and degree level.
#   label: name shown in the legend for this group
#   d:     table holding the WAGP, WKHP, WKW, POVPIP and SCHL columns
# Only POVPIP is averaged with missing values removed; a missing value in any
# other column makes that group's mean NA.
# NOTE(review): WKW and SCHL are averaged as plain numbers; if they are coded
# categories rather than quantities, confirm that a mean of codes is intended.
radar_row <- function(label, d) {
  data.frame(
    group = label,
    wage = mean(d$WAGP),
    hr_per_wk = mean(d$WKHP),
    wk_per_yr = mean(d$WKW),
    income_to_poverty = mean(d$POVPIP, na.rm = TRUE),
    degree = mean(d$SCHL)
  )
}

# Row order fixes both the polygon colours and the legend order below.
radardata <- rbind(
  radar_row("before_stem", before_stem),
  radar_row("before_nonstem", before_nonstem),
  radar_row("after_stem", after_stem),
  radar_row("after_nonstem", after_nonstem)
)

# Express wages in thousands.
radardata$wage <- radardata$wage / 1000

# radarchart() takes the first row as each axis maximum and the second row as
# each axis minimum, followed by one row per polygon.
radar_metrics <- radardata[, c(2:6)]
radardata2 <- rbind(
  unname(vapply(radar_metrics, max, numeric(1))),
  unname(vapply(radar_metrics, min, numeric(1))),
  radar_metrics
)

# Same four hues for outline and fill; the fill is more transparent.
radar_red <- c(0.2, 0.8, 0.7, 0.5)
radar_green <- c(0.5, 0.2, 0.5, 0.2)
radar_blue <- c(0.5, 0.5, 0.1, 0.1)
colors_border <- rgb(radar_red, radar_green, radar_blue, 0.9)
colors_in <- rgb(radar_red, radar_green, radar_blue, 0.4)

radarchart(
  radardata2, axistype = 1,
  # custom polygon
  pcol = colors_border, pfcol = colors_in, plwd = 3, plty = 1,
  # custom the grid
  cglcol = "grey", cglty = 2, axislabcol = "grey",
  caxislabels = c('Min ', '', '', '', 'Max '),
  cglwd = 0.5
)
legend(x = 0.9, y = 1.4, legend = radardata[, 1], bty = "n", pch = 20,
       col = colors_border, cex = 0.9, pt.cex = 1)
title("Changes in Working Conditions After STEM Policy Went Out ", cex.main = 1)
If we look at the gender proportion in jobs, we get some more interesting findings. First of all, gender equality seems to hold up quite well in non-STEM jobs. However, in STEM jobs, the percentage of female employees dropped significantly after the STEM policy went out. What could the potential reasons be? If it is related to the STEM policy, does this mean that the policy is more appealing to male foreign students than to female ones? Yet, if we consider the fact that the proportion of non-US-born employees in the STEM industries is actually not that big, we should realize that the STEM policy could not be the major cause. In other words, there are some other factors that are currently influencing the gender structure in STEM industries, and we should pay attention to this huge gender gap.
While the structure of class of work remained almost the same after the STEM policy went out, we can see a difference between the STEM and non-STEM industry structures, both with and without the STEM policy. Compared to non-STEM industries, fewer STEM jobs are self-employed or in not-for-profit organizations. Instead, STEM people tend to stay at for-profit companies. This difference could be related to both structural change in the STEM industries and changing attitudes among STEM people. It’s likely that, for these people, a for-profit company is the option that stands for self-fulfillment.
# 3 cow, sex
# Share of each category of `column` within the four groups. The first output
# column holds the category (named `label_name`); the other four hold
# proportions that each sum to 1.
group_shares <- function(column, label_name) {
  shares <- data.frame(table(before_stem[[column]]))
  for (d in list(before_nonstem, after_stem, after_nonstem)) {
    shares <- cbind(shares, table(d[[column]]))
  }
  # Each table contributes a (category, count) pair; keep the first category
  # column and the four count columns.
  shares <- shares[, c(1, 2, 4, 6, 8)]
  names(shares) <- c(label_name, 'before_stem', 'before_nonstem',
                     'after_stem', 'after_nonstem')
  shares[, 2:5] <- lapply(shares[, 2:5], function(v) v / sum(v))
  shares
}
cow <- group_shares('COW', 'Class')
sex <- group_shares('SEX', 'Sex')

# Stacked bar chart of the class-of-worker composition per group.
cow_melt <- melt(cow, id.vars = c('Class'))
ggplot(cow_melt, aes(x = variable, y = value, fill = Class)) +
  geom_bar(position = "fill", stat = "identity") +
  ggtitle("Changes in Class of Work After STEM Policy Went Out") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(size = rel(1), face = "bold")) +
  xlab("") +
  ylab("Component Proportion")

# Four concentric pies, outermost to innermost: before_stem, before_nonstem,
# after_stem, after_nonstem.
colors_in1 <- c('rosybrown1', 'lightsteelblue2')
colors_in2 <- c('rosybrown3', 'lightsteelblue4')
colors_in3 <- c('tan1', 'seagreen1')
colors_in4 <- c('tan3', 'seagreen3')
colors_in <- c('rosybrown1', 'lightsteelblue2', 'rosybrown3', 'lightsteelblue4')
ring_colors <- list(colors_in1, colors_in2, colors_in3, colors_in4)
ring_radius <- c(1, 0.75, 0.5, 0.25)
ring_cex <- c(1, 0.8, 1, 0.8)
for (ring in seq_along(ring_colors)) {
  # Every ring after the first is drawn on top of the existing plot.
  if (ring > 1) {
    par(new = TRUE)
  }
  pie(x = c(sex[1, ring + 1], sex[2, ring + 1]), labels = c(""),
      cex = ring_cex[ring], radius = ring_radius[ring],
      col = ring_colors[[ring]])
}

# Hand-built legend: colour columns for the two years, then a marker pair for
# stem vs non-stem.
# NOTE(review): the first slice (first SEX level) takes the first legend
# colour, which is labelled 'female' here - confirm this matches the SEX
# coding of the data.
legend(x = 1.5, y = 0.5, legend = c('', ''), col = colors_in1, bty = "n",
       pch = 20, cex = 0.8, pt.cex = 2)
legend(x = 1.9, y = 0.5, legend = c('', ''), col = colors_in3, bty = "n",
       pch = 20, cex = 0.8, pt.cex = 2)
text(x = 1.6, y = 0.6, labels = '2007', cex = 0.8)
text(x = 2, y = 0.6, labels = '2014', cex = 0.8)
text(x = 1.2, y = 0.4, labels = 'female', cex = 0.8)
text(x = 1.2, y = 0.25, labels = 'male', cex = 0.8)
legend(x = 1.5, y = 0., legend = c('', ''), col = c('white', 'grey'),
       bty = "n", pch = 20, cex = 0.8, pt.cex = 2)
legend(x = 1.475, y = 0.05, pt.cex = 1.4, bty = "n", legend = c(""), pch = 21)
text(x = 1.2, y = -0.11, labels = 'stem', cex = 0.8)
text(x = 1.25, y = -0.25, labels = 'non-stem', cex = 0.8)
title("Changes in Gender Proportion of Jobs After STEM Policy Went Out")
STEM jobs used to be the kind of jobs in which you needed to put in more in order to get more. The good thing was that you didn’t need an advanced degree to get such jobs. After the STEM policy went out, the degree requirement for STEM jobs rose sharply. Though the same is true of non-STEM jobs, STEM jobs are still the group that requires the highest average degree attainment.
We can also read more information from this figure. For example, STEM jobs currently show a better wage-to-degree ratio than non-STEM jobs, and the gap is quite obvious. However, in terms of wage relative to working hours, the answer could be the opposite. But this doesn’t mean that a non-STEM job is a better choice, since you are not the one who decides how many hours you work.
# 4 year of entry-stem/nonstem-before/after (for those who naturalized)
# Compares the distribution of age at entry between the two survey years,
# separately for STEM and non-STEM jobs, among rows with CIT == 4.
before_naturalize <- before_data[before_data$CIT == 4]
after_naturalize <- after_data[after_data$CIT == 4]

# Age at entry = year of entry minus approximate birth year, where birth year
# is taken as survey year (2007 / 2014) minus age.
before_naturalize$AGEOFENTRY <- before_naturalize$YOEP - (2007 - before_naturalize$AGEP)
after_naturalize$AGEOFENTRY <- after_naturalize$YOEP - (2014 - after_naturalize$AGEP)

before_naturalize_stem <- before_naturalize[before_naturalize$SOCP == 'STEM']
before_naturalize_nonstem <- before_naturalize[before_naturalize$SOCP == 'NON-STEM']
after_naturalize_stem <- after_naturalize[after_naturalize$SOCP == 'STEM']
after_naturalize_nonstem <- after_naturalize[after_naturalize$SOCP == 'NON-STEM']

# Bin ages into twenty 5-year ranges covering 0-100. include.lowest = TRUE
# keeps an age of exactly 0 in the first bin; without it cut() returns NA for
# that value and the person is dropped from the counts.
entry_age_breaks <- seq(0, 100, length.out = 21)
bin_entry_age <- function(age) {
  cut(age, entry_age_breaks, include.lowest = TRUE)
}

before_naturalize_age_stem <- bin_entry_age(before_naturalize_stem$AGEOFENTRY)
before_naturalize_age_stem <- data.frame(table(before_naturalize_age_stem))
before_naturalize_age_nonstem <- bin_entry_age(before_naturalize_nonstem$AGEOFENTRY)
before_naturalize_age_nonstem <- data.frame(table(before_naturalize_age_nonstem))
after_naturalize_age_stem <- bin_entry_age(after_naturalize_stem$AGEOFENTRY)
after_naturalize_age_stem <- data.frame(table(after_naturalize_age_stem))
after_naturalize_age_nonstem <- bin_entry_age(after_naturalize_nonstem$AGEOFENTRY)
after_naturalize_age_nonstem <- data.frame(table(after_naturalize_age_nonstem))

# Pair the before/after counts per age range and convert them to proportions.
naturalized_age_stem <- merge(before_naturalize_age_stem, after_naturalize_age_stem,
                              by.x = "before_naturalize_age_stem",
                              by.y = 'after_naturalize_age_stem')
names(naturalized_age_stem) <- c('age_range', 'before_stem', 'after_stem')
naturalized_age_stem$before_stem <- naturalized_age_stem$before_stem / sum(naturalized_age_stem$before_stem)
naturalized_age_stem$after_stem <- naturalized_age_stem$after_stem / sum(naturalized_age_stem$after_stem)

naturalized_age_nonstem <- merge(before_naturalize_age_nonstem, after_naturalize_age_nonstem,
                                 by.x = "before_naturalize_age_nonstem",
                                 by.y = 'after_naturalize_age_nonstem')
names(naturalized_age_nonstem) <- c('age_range', 'before_nonstem', 'after_nonstem')
naturalized_age_nonstem$before_nonstem <- naturalized_age_nonstem$before_nonstem / sum(naturalized_age_nonstem$before_nonstem)
naturalized_age_nonstem$after_nonstem <- naturalized_age_nonstem$after_nonstem / sum(naturalized_age_nonstem$after_nonstem)

# Both merged tables contain the same age ranges in the same row order, so
# they are bound side by side; the column pick keeps one age_range column and
# orders the series before_stem, before_nonstem, after_stem, after_nonstem.
naturalized_age <- cbind(naturalized_age_stem, naturalized_age_nonstem)
naturalized_age <- naturalized_age[, c(1, 2, 5, 3, 6)]
naturalized_age <- melt(naturalized_age, id = 'age_range')

ggplot(data = naturalized_age,
       aes(x = age_range, y = value, col = variable, group = variable)) +
  geom_line() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("Changes in Age of Entry Among Immigrants") +
  theme(plot.title = element_text(size = rel(1), face = "bold")) +
  ylab("Proportion")
If we assume the STEM policy does attract a lot of foreign students to come and study in the US, does it mean that this policy will further influence them to stay in the US permanently? From the figure above, the answer appears to be no. The figure presents the distribution of age of entry among immigrants to the US. We can see from the plot that the proportion of immigrants who came to the US during their schooling years didn’t change much after the STEM policy was published, for either STEM or non-STEM people.